# Attach every package the script needs, installing it first when it is
# missing. require() only returns FALSE for an absent package, so after
# install.packages() the package still has to be attached with library();
# otherwise the first run on a fresh machine installs everything and then
# fails at the first pipe or ggplot() call.
if (!require("ggplot2")) { # for data visualization
  install.packages('ggplot2')
  library("ggplot2")
}
if (!require("tidyverse")) { # dealing with dataframes
  install.packages('tidyverse')
  library("tidyverse")
}
if (!require("tidylog")) { # logs for tidyverse
  install.packages('tidylog')
  library("tidylog")
}
if (!require("plotrix")) { # for the standard error function
  install.packages('plotrix')
  library("plotrix")
}
# Read the songs CSV, drop every row that contains a missing value, and add
# "explicit_text": a text label for the binary "explicit" column
# (1 -> "Explicit", 0 -> "Implicit"; any other value is left as NA).
songsDataset <- read.csv('songs.csv') %>%
  na.omit() %>%
  mutate(
    explicit_text = case_when(
      explicit == 1 ~ "Explicit",
      explicit == 0 ~ "Implicit"
    )
  )
# Descriptive statistics of "popularity", one row per explicit_text group
# (Explicit songs and Implicit songs):
#   popularity_mean - mean of popularity
#   n               - sample size
#   std             - standard deviation
#   sterr           - standard error of the mean (plotrix::std.error)
#   median          - median of popularity
# The plots further down read these columns by name.
stats_popularity_per_type <- summarise(
  group_by(songsDataset, explicit_text),
  popularity_mean = mean(popularity),
  n = n(),
  std = sd(popularity),
  sterr = std.error(popularity),
  median = median(popularity)
)
# Side-by-side QQ plots of "popularity" for Implicit and Explicit songs, used
# to eyeball the normality assumption within each group. Points are coloured
# per group (legend hidden, the facet strips already name the groups) and each
# panel gets its own reference line.
popularity_qq_plot <- ggplot(songsDataset) +
  geom_qq(aes(sample = popularity, color = explicit_text), size = 1) +
  geom_qq_line(aes(sample = popularity)) +
  # NOTE(review): ncol = 6 is more columns than the two labels assigned to
  # explicit_text; kept as in the original - confirm whether 2 was intended.
  facet_wrap(~explicit_text, ncol = 6, shrink = TRUE) +
  guides(color = 'none') +
  labs(
    x = 'Theoretical Z score',
    y = 'Popularity',
    title = 'QQplot for the variable popularity in both Implicit & Explicit songs',
    subtitle = 'To check the normality assumption on each group'
  ) +
  theme(
    plot.title = element_text(color = "grey20", size = 16, face = "bold.italic"),
    plot.subtitle = element_text(color = "grey20", face = "italic")
  )
# Print explicitly: a bare ggplot expression is only drawn through top-level
# auto-printing, which does not happen when the script is run via source()
# with its default arguments.
print(popularity_qq_plot)
# Density of "popularity" for Explicit vs Implicit songs, one stacked panel
# per group. Each panel shows a dashed vertical line at the group's sample
# mean and a column of text with the group's descriptive statistics taken
# from stats_popularity_per_type.

# Colours are named by group so the mapping does not depend on level order.
group_palette <- c(Explicit = 'cornflowerblue', Implicit = 'darkgoldenrod')

# One row per (group, statistic): the text to draw and the height to draw it
# at. Rows are laid out statistic by statistic, so every vector below is
# "all groups for stat 1, all groups for stat 2, ...".
stat_labels <- with(stats_popularity_per_type, data.frame(
  explicit_text = rep(explicit_text, times = 5),
  y = rep(c(0.03, 0.025, 0.02, 0.015, 0.01), each = length(explicit_text)),
  label = c(
    paste('N:', n),
    paste('Mean:', round(popularity_mean, 2)),
    paste('Median:', round(median, 2)),
    paste('Std:', round(std, 2)),
    paste('Sterr:', round(sterr, 2))
  ),
  stringsAsFactors = FALSE
))

popularity_density_plot <- ggplot(
  songsDataset,
  aes(x = popularity, fill = explicit_text)
) +
  geom_density(alpha = 0.5) +
  geom_vline(
    data = stats_popularity_per_type,
    aes(xintercept = popularity_mean),
    linetype = "dashed"
  ) +
  # Statistics text, drawn at a fixed x position in each panel.
  geom_text(
    data = stat_labels,
    aes(x = 87.5, y = y, label = label, color = explicit_text),
    size = 4
  ) +
  facet_wrap(~explicit_text, ncol = 1) +
  guides(color = 'none', fill = 'none') +
  scale_fill_manual(values = group_palette) +
  scale_color_manual(values = group_palette) +
  labs(
    title = 'Is there a difference in popularity between Explicit and Implicit songs?',
    subtitle = 'each vertical line represents the sample mean',
    y = 'Density',
    x = 'Popularity grade'
  ) +
  theme(
    plot.title = element_text(color = "grey25", size = 16, face = "bold.italic"),
    plot.subtitle = element_text(color = "grey25", face = "italic")
  )
# Print explicitly: a bare ggplot expression is only drawn through top-level
# auto-printing, which does not happen when the script is run via source()
# with its default arguments.
print(popularity_density_plot)